林嶔 (Lin, Chin)
Lesson 20
– Suppose two variables are both predictive of the outcome, but each one, viewed separately, shows no relationship at all; the relationship only appears when they are considered together. Under the logic by which a decision tree operates, wouldn't that be impossible to handle?
– Put another way, the two variables interact with each other.
x1 = rep(0:1, 500)
x2 = rep(0:1, rep(500, 2))
y = x1 + x2 - 2*x1*x2
y = as.factor(y)
table(x1, y)
## y
## x1 0 1
## 0 250 250
## 1 250 250
table(x2, y)
## y
## x2 0 1
## 0 250 250
## 1 250 250
table(x1, y, x2)
## , , x2 = 0
##
## y
## x1 0 1
## 0 250 0
## 1 0 250
##
## , , x2 = 1
##
## y
## x1 0 1
## 0 0 250
## 1 250 0
library(party)
tree.model = ctree(y ~ x1 + x2, data = data.frame(y, x1, x2))
plot(tree.model)
So tree-based models have an important weakness: when this kind of "feature interdependence" is present, they lose their fighting ability.
But even if I know that two variables must be used together with their interaction taken into account, once the number of variables grows large, wouldn't such combinations be impossible to enumerate?
set.seed(0)
x1 = rnorm(300)
x2 = rnorm(300)
lr1 = x1^2 + x2^2
y = lr1 > mean(lr1)
plot(x1, x2, col = (y + 1)*2, pch = 19)
library(rgl)
plot3d(x = x1,
       y = x2,
       z = x1*x2,
       col = (y + 1)*2, size = 3, main = "3D Scatterplot")
plot(x1^2, x2^2, col = (y + 1)*2, pch = 19)
– In a 2-dimensional plane, it tries to find a line that perfectly separates the red points from the blue points.
x1 = c(0, 2, 2, 3)
x2 = c(0, 2, 0, 0)
y = c(1, 1, -1, -1)
plot(x1, x2, col = y + 3, pch = 19, cex = 3)
– Of course, at a glance we can see plenty of lines that easily separate these four points perfectly; for example, x2 = -0.5 + 0.5 × x1 cleanly splits the red and blue points.
plot(x1, x2, col = y + 3, pch = 19, cex = 3)
abline(a = -0.5, b = 0.5, lwd = 2, lty = 1)
– But that is not enough. Even if we cannot articulate exactly why, you probably feel that the line x2 = -1 + 1 × x1 makes a better cut!
plot(x1, x2, col = y + 3, pch = 19, cex = 3)
abline(a = -1, b = 1, lwd = 2, lty = 1)
plot(x1, x2, col = y + 3, pch = 19, cex = 3)
abline(a = -1, b = 1, lwd = 2, lty = 1)
abline(a = 0, b = 1, lwd = 2, lty = 2)
abline(a = -2, b = 1, lwd = 2, lty = 2)
We want to find a line that separates these points perfectly (the constraint),
and that line should be as far as possible from the closest point (the quantity to maximize).
– For such a line, w0 = 1, w1 = -1, w2 = 1 gives the solution we want in this example.
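– For reference, the point-to-line distance that the distance.func below computes, written out explicitly:

$$d(x_1, x_2) = \frac{\left|w_0 + w_1 x_1 + w_2 x_2\right|}{\sqrt{w_1^2 + w_2^2}}$$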
distance.func = function (x1, x2, w0, w1, w2) {
  dist = 1/sqrt(w1^2 + w2^2) * abs(w0 + w1 * x1 + w2 * x2)
  return(dist)
}
distance.func(0, 0, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(2, 2, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(2, 0, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(3, 0, w0 = 1, w1 = -1, w2 = 1)
## [1] 1.414214
distance.func = function (x1, x2, y, w0, w1, w2) {
  dist = 1/sqrt(w1^2 + w2^2) * y * (w0 + w1 * x1 + w2 * x2)
  return(dist)
}
distance.func(0, 0, 1, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(2, 2, 1, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(2, 0, -1, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(3, 0, -1, w0 = 1, w1 = -1, w2 = 1)
## [1] 1.414214
– Since we set the two dashed lines to be w0 + w1 × x1 + w2 × x2 = 1 and w0 + w1 × x1 + w2 × x2 = -1, the distance of every point from the solid line must at least satisfy the following equation.
– And since that quantity is always greater than or equal to 1, it no longer matters for maximizing the distance equation; we only need to maximize…
– Rewriting the expression a little, the problem becomes simpler.
– We have to switch to quadratic programming to solve it. Here we let a package get us over the hurdle, but we first need to recast our problem:
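– A reconstruction of the optimization problem referred to above (the original equation images are not reproduced in this text), together with how it maps onto quadprog's solve.QP interface:

$$\min_{w_0, w_1, w_2}\ \frac{1}{2}\left(w_1^2 + w_2^2\right) \quad \text{subject to} \quad y_i\,(w_0 + w_1 x_{1i} + w_2 x_{2i}) \ge 1,\quad i = 1, \dots, n$$

– solve.QP minimizes (1/2) b'Db - d'b subject to A'b >= b0, so the weight vector (w0, w1, w2) plays the role of b, the i-th column of Amat is y_i · (1, x1_i, x2_i), and bvec is a vector of ones, exactly as the code below builds them.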
library(quadprog)
n.sample = 4
n.weight = 3
small.value = 5e-6
Q.matrix = matrix(small.value, nrow = n.weight, ncol = n.weight)
Q.matrix = Q.matrix + small.value*diag(n.weight)
A.matrix = rbind(rep(1, n.sample)*y, x1*y, x2*y)
fit = solve.QP(Dmat = Q.matrix, dvec = rep(0, n.weight), Amat = A.matrix, bvec = rep(1, n.sample))
fit$solution
## [1] 1 -1 1
– Once the solution is obtained, we can use it to draw the lines.
COEF = fit$solution
A0 = -COEF[1]/COEF[3]
A1 = A0 + 1/COEF[3]
A2 = A0 - 1/COEF[3]
B = -COEF[2]/COEF[3]
plot(x1, x2, col = y + 3, pch = 19, cex = 3)
abline(a = A0, b = B, lwd = 2, lty = 1)
abline(a = A1, b = B, lwd = 2, lty = 2)
abline(a = A2, b = B, lwd = 2, lty = 2)
– Let's wrap it all into a complete solver function.
mysvm = function (x1, x2, y) {
  # Hard-margin linear SVM solved in the primal form via quadratic programming
  require(quadprog)
  n.sample = length(x1)
  n.weight = 3
  small.value = 5e-6
  # Dmat approximates diag(0, 1, 1) -- the objective (w1^2 + w2^2)/2 --
  # with small values added so the matrix stays positive definite
  Q.matrix = matrix(small.value, nrow = n.weight, ncol = n.weight)
  diag(Q.matrix) = 1
  Q.matrix[1,1] = small.value
  # Each constraint is y_i * (w0 + w1*x1_i + w2*x2_i) >= 1
  A.matrix = rbind(rep(1, n.sample)*y, x1*y, x2*y)
  fit = solve.QP(Dmat = Q.matrix, dvec = rep(0, n.weight), Amat = A.matrix, bvec = rep(1, n.sample))
  COEF = fit$solution
  A0 = -COEF[1]/COEF[3]
  A1 = A0 + 1/COEF[3]
  A2 = A0 - 1/COEF[3]
  B = -COEF[2]/COEF[3]
  plot(x1, x2, col = y + 3, pch = 19)
  abline(a = A0, b = B, lwd = 2, lty = 1)
  abline(a = A1, b = B, lwd = 2, lty = 2)
  abline(a = A2, b = B, lwd = 2, lty = 2)
}
mysvm(x1 = c(0, 2, 2, 3),
x2 = c(0, 2, 0, 0),
y = c(1, 1, -1, -1))
mysvm(x1 = c(0, 2, 3, 4, 5),
x2 = c(0, 2, 0, 0, 3),
y = c(1, 1, -1, -1, -1))
data(iris)
sub.iris = iris[1:100,]
X1 = sub.iris[,1]
X2 = sub.iris[,2]
Y = as.integer(sub.iris[,5]) * 2 - 3
mysvm(x1 = X1, x2 = X2, y = Y)
glm.model = glm((Y>0)~X1+X2, family = "binomial")
COEF = glm.model$coefficients
A = -COEF[1]/COEF[3]
B = -COEF[2]/COEF[3]
plot(X1, X2, col = Y + 3, pch = 19)
abline(a = A, b = B, col = "darkgreen")
– Before moving on to the next part, we first transform the "constrained" optimization problem above: by adding "Lagrange multipliers" to the expression being solved, the problem becomes an "unconstrained" optimization problem.
– Next we revise our optimization objective to:
– Then we can transform the problem into:
– Given a special set of conditions, the KKT conditions, we can eliminate W and b:
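– Since those equation images are not reproduced here, the derivation is written out below as a standard reconstruction that matches the solve.QP call which follows. The Lagrangian of the hard-margin problem is

$$L(\mathbf{w}, b, \boldsymbol{\alpha}) = \frac{1}{2}\lVert\mathbf{w}\rVert^2 - \sum_{i=1}^{n} \alpha_i\left[y_i(\mathbf{w}^\top \mathbf{x}_i + b) - 1\right], \qquad \alpha_i \ge 0$$

– The KKT stationarity conditions give $\mathbf{w} = \sum_{i} \alpha_i y_i \mathbf{x}_i$ and $\sum_{i} \alpha_i y_i = 0$, and substituting them back yields the dual problem

$$\max_{\boldsymbol{\alpha}}\ \sum_{i=1}^{n}\alpha_i - \frac{1}{2}\sum_{i=1}^{n}\sum_{j=1}^{n}\alpha_i\alpha_j\, y_i y_j\, \mathbf{x}_i^\top\mathbf{x}_j \quad \text{s.t.}\quad \alpha_i \ge 0,\ \ \sum_{i=1}^{n}\alpha_i y_i = 0$$

– Maximizing this is the same as minimizing $\frac{1}{2}\boldsymbol{\alpha}^\top Q\,\boldsymbol{\alpha} - \sum_i \alpha_i$ with $Q_{ij} = y_i y_j\,\mathbf{x}_i^\top\mathbf{x}_j$, which is what the solve.QP call below does: Dmat = (y %*% t(y)) * (X %*% t(X)), dvec = rep(1, n), the equality constraint handled by meq = 1, and the remaining constraints keeping every alpha non-negative.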
#set.seed(0)
x1 = c(0, 2, 2, 3)
x2 = c(0, 2, 0, 0)
y = c(1, 1, -1, -1)
library(quadprog)
n.sample = 4
small.value = 5e-6
X = cbind(x1, x2)
Q.matrix = (y%*%t(y))*(X%*%t(X))
Q.matrix = Q.matrix + small.value*diag(n.sample)
A.matrix = t(rbind(matrix(y, ncol = n.sample), diag(n.sample)))
fit = solve.QP(Dmat = Q.matrix, dvec = rep(1, n.sample), Amat = A.matrix, bvec = rep(0, n.sample + 1), meq = 1, factorized = FALSE)
qpsol <- fit$solution
print(qpsol)
## [1] 0.4999981 0.4999981 0.9999963 0.0000000
– Don't forget the KKT conditions we imposed when solving for the extremum; let's review the relations.
– One thing to note here: if a Lagrange multiplier equals 0, that point cannot be used when computing b, and it also has no influence whatsoever on the resulting W vector.
– The remaining points, those whose Lagrange multipliers are greater than 0, are called "support vectors"; they are exactly the points lying closest to the wide margin, and now we finally understand why this method is called a "support vector machine".
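– Written out, these are the relations that the findCoefs function below implements:

$$\mathbf{W} = \sum_{i:\,\alpha_i > 0} \alpha_i y_i\, \mathbf{x}_i, \qquad b = y_s - \mathbf{W}^\top\mathbf{x}_s \ \text{ for any support vector } s \ (\alpha_s > 0)$$

– In the code, b is averaged over all support vectors for numerical stability.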
findCoefs <- function(a, y, X){
  # Recover (w0, w1, w2) from the Lagrange multipliers: only support vectors (a > 0) contribute
  nonzero <- abs(a) > 5e-6
  W <- rowSums(sapply(which(nonzero), function(i) a[i]*y[i]*X[i,]))
  b <- mean(sapply(which(nonzero), function(i) y[i]-X[i,]%*%W))
  result <- c(b, W)
  names(result) = c("w0", "w1", "w2")
  return(result)
}
coefs = findCoefs(qpsol, y, X)
print(coefs)
## w0 w1 w2
## 0.9999975 -0.9999963 0.9999963
A = -coefs[1]/coefs[3]
B = -coefs[2]/coefs[3]
plot(x1, x2, col = y + 3, pch = 19)
abline(a = A, b = B, lwd = 2, lty = 1)
library(e1071)
x1 = c(0, 2, 2, 3)
x2 = c(0, 2, 0, 0)
y = c(1, 1, -1, -1)
svm.model = svm(y ~ x1 + x2, kernel = "linear", scale = FALSE, type = "C-classification", cost = 1e5)
– These are the "support vectors":
svm.model$SV
## x1 x2
## 1 0 0
## 2 2 2
## 3 2 0
– These are each support vector's Lagrange multiplier multiplied by its label:
svm.model$coefs
## [,1]
## [1,] 0.5
## [2,] 0.5
## [3,] -1.0
– This is the negative of the intercept:
svm.model$rho
## [1] -1
W.vector = rowSums(sapply(1:length(svm.model$coefs), function(i) svm.model$coefs[i]*svm.model$SV[i,]))
w0 = -svm.model$rho
w1 = W.vector[1]
w2 = W.vector[2]
A0 = -w0/w2
A1 = A0 + 1/w2
A2 = A0 - 1/w2
B = -w1/w2
plot(x1, x2, col = y + 3, pch = 19, cex = 3)
abline(a = A0, b = B, lwd = 2, lty = 1)
abline(a = A1, b = B, lwd = 2, lty = 2)
abline(a = A2, b = B, lwd = 2, lty = 2)
svm.model$decision.values
## 1/-1
## 1 1
## 2 1
## 3 -1
## 4 -2
data(iris)
sub.iris = iris[1:100,c(1, 2, 5)]
sub.iris[,3] = as.factor(as.character(sub.iris[,3]))
X1 = sub.iris[,1]
X2 = sub.iris[,2]
Y = as.integer(sub.iris[,3])
svm.model = svm(Species ~ Sepal.Length + Sepal.Width, data = sub.iris, kernel = "linear", scale = FALSE, type = "C-classification", cost = 1e5)
W.vector = rowSums(sapply(1:length(svm.model$coefs), function(i) svm.model$coefs[i]*svm.model$SV[i,]))
w0 = -svm.model$rho
w1 = W.vector[1]
w2 = W.vector[2]
A0 = -w0/w2
A1 = A0 + 1/w2
A2 = A0 - 1/w2
B = -w1/w2
plot(X1, X2, col = Y * 2 + 2, pch = 19)
abline(a = A0, b = B, lwd = 2, lty = 1)
abline(a = A1, b = B, lwd = 2, lty = 2)
abline(a = A2, b = B, lwd = 2, lty = 2)
predict(svm.model, data.frame(Sepal.Length = 5, Sepal.Width = 4))
## 1
## setosa
## Levels: setosa versicolor
predict(svm.model, data.frame(Sepal.Length = 6, Sepal.Width = 3))
## 1
## versicolor
## Levels: setosa versicolor
plot(svm.model, sub.iris)
– The key is how you apply "w0, w1, w2".
– Hint: first think about how the "decision.values" are computed.
data(iris)
sub.iris = iris[1:100,c(1, 2, 5)]
sub.iris[,3] = as.factor(as.character(sub.iris[,3]))
X1 = sub.iris[,1]
X2 = sub.iris[,2]
Y = as.integer(sub.iris[,3])
svm.model = svm(Species ~ Sepal.Length + Sepal.Width, data = sub.iris, kernel = "linear", scale = FALSE, type = "C-classification", cost = 1e5)
W.vector = rowSums(sapply(1:length(svm.model$coefs), function(i) svm.model$coefs[i]*svm.model$SV[i,]))
w0 = -svm.model$rho
w1 = W.vector[1]
w2 = W.vector[2]
predict(svm.model, data.frame(Sepal.Length = 5, Sepal.Width = 4), decision.values = TRUE)
## 1
## setosa
## attr(,"decision.values")
## setosa/versicolor
## 1 6.791608
## Levels: setosa versicolor
– What "Q.matrix" actually needs is the result of the inner products of z, and taking the inner product of the original x first and then applying the "dimension-expansion operation" gives exactly the same answer as expanding the dimensions first and then taking the inner product!
– Let's compare the two, using a "second-order polynomial transformation".
– Note that this kernel function is designed to help you grasp the idea quickly; it is not the actual polynomial kernel function used inside the SVM function of package "e1071".
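– The identity that the next two code blocks verify numerically, with $\mathbf{z}_i = (x_{1i},\, x_{2i},\, x_{1i}^2,\, x_{1i}x_{2i},\, x_{2i}x_{1i},\, x_{2i}^2)$:

$$\mathbf{z}_i^\top\mathbf{z}_j = \mathbf{x}_i^\top\mathbf{x}_j + \left(\mathbf{x}_i^\top\mathbf{x}_j\right)^2 = K(\mathbf{x}_i, \mathbf{x}_j)$$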
x1 = c(0, 2, 2, 3)
x2 = c(0, 2, 0, 0)
y = c(1, 1, -1, -1)
X = cbind(x1, x2)
Z = cbind(x1, x2, x1^2, x1*x2, x2*x1, x2^2)
Q.matrix_1 = (y%*%t(y))*(Z%*%t(Z))
Q.matrix_1
## [,1] [,2] [,3] [,4]
## [1,] 0 0 0 0
## [2,] 0 72 -20 -42
## [3,] 0 -20 20 42
## [4,] 0 -42 42 90
X.DOT = (X%*%t(X))
Q.matrix_2 = (y%*%t(y))*(X.DOT + X.DOT^2)
Q.matrix_2
## [,1] [,2] [,3] [,4]
## [1,] 0 0 0 0
## [2,] 0 72 -20 -42
## [3,] 0 -20 20 42
## [4,] 0 -42 42 90
x1 = c(0, 2, 2, 3)
x2 = c(0, 2, 0, 0)
y = c(1, 1, -1, -1)
library(quadprog)
n.sample = 4
small.value = 5e-6
X = cbind(x1, x2)
Z = cbind(x1, x2, x1^2, x1*x2, x2*x1, x2^2)
X.DOT = (X%*%t(X))
Q.matrix = (y%*%t(y))*(X.DOT + X.DOT^2)
Q.matrix = Q.matrix + small.value*diag(n.sample)
A.matrix = t(rbind(matrix(y, ncol = n.sample), diag(n.sample)))
fit = solve.QP(Dmat = Q.matrix, dvec = rep(1, n.sample), Amat = A.matrix, bvec = rep(0, n.sample + 1), meq = 1, factorized = FALSE)
qpsol <- fit$solution
findCoefs_1 <- function(a, y, Z){
  nonzero <- abs(a) > 5e-6
  W <- rowSums(sapply(which(nonzero), function(i) a[i]*y[i]*Z[i,]))
  b <- mean(sapply(which(nonzero), function(i) y[i]-Z[i,]%*%W))
  result <- c(b, W)
  names(result) = c("w0", "w1", "w2", "w11", "w12", "w21", "w22")
  return(result)
}
coefs = findCoefs_1(qpsol, y, Z)
print(coefs)
## w0 w1 w2 w11 w12 w21
## 0.99999950 -0.19999988 0.07692304 -0.39999976 0.15384609 0.15384609
## w22
## 0.15384609
– So far the kernel function has only helped us avoid dimension expansion while solving for the Lagrange multipliers; afterwards we still need dimension expansion in order to make predictions.
set.seed(0)
x1 = rnorm(300)
x2 = rnorm(300)
lr1 = x1^2 + x2^2
y = (lr1 > mean(lr1)) * 2 - 1
library(quadprog)
n.sample = 300
small.value = 5e-6
X = cbind(x1, x2)
Z = cbind(x1, x2, x1^2, x1*x2, x2*x1, x2^2)
X.DOT = (X%*%t(X))
Q.matrix = (y%*%t(y))*(X.DOT + X.DOT^2)
Q.matrix = Q.matrix + small.value*diag(n.sample)
A.matrix = t(rbind(matrix(y, ncol = n.sample), diag(n.sample)))
fit = solve.QP(Dmat = Q.matrix, dvec = rep(1, n.sample), Amat = A.matrix, bvec = rep(0, n.sample + 1), meq = 1, factorized = FALSE)
qpsol <- fit$solution
findCoefs_1 <- function(a, y, Z){
  nonzero <- abs(a) > 5e-6
  W <- rowSums(sapply(which(nonzero), function(i) a[i]*y[i]*Z[i,]))
  b <- mean(sapply(which(nonzero), function(i) y[i]-Z[i,]%*%W))
  result <- c(b, W)
  names(result) = c("w0", "w1", "w2", "w11", "w12", "w21", "w22")
  return(result)
}
coefs = findCoefs_1(qpsol, y, Z)
print(coefs)
## w0 w1 w2 w11 w12
## -16.18894664 -0.13535444 -0.39142162 8.78193290 0.02013271
## w21 w22
## 0.02013271 8.73392243
x2.function = function (x1, w0, w1, w2, w3, w4, w5, w6) {
  # Approximately solve the fitted quadratic decision boundary for x2 at a given x1
  # (the small cross-term coefficients are treated as constants when completing the square)
  left.value = -(w0 + w1*x1 + w3*x1^2 - (w2+w4+w5)^2/(4*w6))/w6
  sqrt.left.value = sqrt(left.value)
  answer = sqrt.left.value - (w2+w4+w5)/(2*w6)
  return(answer)
}
x1.seq = seq(-3, 3, by = 0.01)
x2.seq = x2.function(x1.seq, w0 = coefs[1],
w1 = coefs[2], w2 = coefs[3], w3 = coefs[4],
w4 = coefs[5], w5 = coefs[6], w6 = coefs[7])
real.x1.seq = c(x1.seq, x1.seq[601:1])
real.x2.seq = c(x2.seq, -x2.seq[601:1])
real.x1.seq = real.x1.seq[!is.na(real.x2.seq)]
real.x2.seq = real.x2.seq[!is.na(real.x2.seq)]
plot(x1, x2, col = y + 3, pch = 19)
lines(real.x1.seq, real.x2.seq, lwd = 2)
– Let's first recall how we computed the "decision.values":
coefs = findCoefs_1(qpsol, y, Z)
decision.values_1 = cbind(rep(1, 300), Z)%*%coefs
head(decision.values_1, 6)
## [,1]
## [1,] -2.174625
## [2,] 2.986107
## [3,] 2.899798
## [4,] 30.621854
## [5,] -14.002210
## [6,] 18.937735
– Let's first consolidate this into a function.
predict_1 <- function(a, y, Z){
  nonzero <- abs(a) > 5e-6
  W <- rowSums(sapply(which(nonzero), function(i) a[i]*y[i]*Z[i,]))
  b <- mean(sapply(which(nonzero), function(i) y[i]-Z[i,]%*%W))
  result = Z%*%W + b
  return(result)
}
decision.values_2 = predict_1(qpsol, y, Z)
all.equal(decision.values_1, decision.values_2)
## [1] TRUE
predict_2 <- function(a, y, Z, new.Z){
  nonzero <- abs(a) > 5e-6
  support.a = a[nonzero]
  support.y = y[nonzero]
  support.Z = Z[nonzero,]          #svm.model$SV
  coefs <- support.a*support.y     #svm.model$coefs
  result_0 <- rowSums(sapply(1:length(support.a),
                             function (i) {coefs[i]*support.Z[i,]%*%t(support.Z)}))
  result_1 <- rowSums(sapply(1:length(support.a),
                             function (i) {coefs[i]*support.Z[i,]%*%t(new.Z)}))
  b <- mean(support.y - result_0)  #-svm.model$rho
  return(result_1+b)
}
decision.values_3 = predict_2(qpsol, y, Z, Z)
all.equal(as.numeric(decision.values_1), decision.values_3)
## [1] TRUE
predict_3 <- function(a, y, X, new.X){
  nonzero <- abs(a) > 5e-6
  support.a = a[nonzero]
  support.y = y[nonzero]
  support.X = X[nonzero,]          #svm.model$SV
  coefs <- support.a*support.y     #svm.model$coefs
  result_0 <- rowSums(sapply(1:length(support.a),
                             function (i) {
                               X.DOT = support.X[i,]%*%t(support.X)
                               #Our kernel function
                               Kernel.X.DOT = (X.DOT+X.DOT^2)
                               coefs[i]*Kernel.X.DOT
                             }))
  result_1 <- rowSums(sapply(1:length(support.a),
                             function (i) {
                               X.DOT = support.X[i,]%*%t(new.X)
                               #Our kernel function
                               Kernel.X.DOT = (X.DOT+X.DOT^2)
                               coefs[i]*Kernel.X.DOT
                             }))
  b <- mean(support.y - result_0)  #-svm.model$rho
  return(result_1+b)
}
decision.values_4 = predict_3(qpsol, y, X, X)
all.equal(as.numeric(decision.values_1), decision.values_4)
## [1] TRUE
– The only drawback is that we can no longer recover the individual coefficients, but at this point we finally understand why the outputs given by package "e1071" are "SV", "coefs", and "rho".
– The polynomial kernel function actually used inside the SVM function of package "e1071" looks like this: (gamma × u'.v + coef0)^degree (see the function's documentation).
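– Putting the pieces together (a reconstruction consistent with predict_3 above and predict_kernel_1 below): the kernelized decision function needs only the support vectors, their coefficients, and rho,

$$f(\mathbf{x}) = \sum_{i \in SV} \alpha_i y_i\, K(\mathbf{x}_i, \mathbf{x}) + b, \qquad \text{where } \texttt{coefs}_i = \alpha_i y_i,\ \ \texttt{SV} = \{\mathbf{x}_i : \alpha_i > 0\},\ \ b = -\texttt{rho}$$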
set.seed(0)
x1 = rnorm(300)
x2 = rnorm(300)
lr1 = x1^2 + x2^2
y = (lr1 > mean(lr1)) * 2 - 1
X = cbind(x1, x2)
dat1 = data.frame(y = factor(y), x1 = x1, x2 = x2)
library(e1071)
svm.model = svm(y ~ x1 + x2, data = dat1, kernel = "polynomial", scale = FALSE,
type = "C-classification", gamma = 0.5, coef0 = 1, degree = 2, cost = 1e5)
plot(svm.model, dat1)
predict_kernel_1 <- function(SV, coefs, rho, new.X){
  result_0 <- rowSums(sapply(1:nrow(SV),
                             function (i) {
                               X.DOT = SV[i,]%*%t(SV)
                               #polynomial kernel function (gamma = 0.5; coef0 = 1; degree = 2)
                               Kernel.X.DOT = (0.5*X.DOT + 1)^2
                               coefs[i]*Kernel.X.DOT
                             }))
  result_1 <- rowSums(sapply(1:nrow(SV),
                             function (i) {
                               X.DOT = SV[i,]%*%t(new.X)
                               #polynomial kernel function (gamma = 0.5; coef0 = 1; degree = 2)
                               Kernel.X.DOT = (0.5*X.DOT + 1)^2
                               coefs[i]*Kernel.X.DOT
                             }))
  b <- -rho
  return(result_1+b)
}
my_decision.values = predict_kernel_1(svm.model$SV, svm.model$coefs, svm.model$rho, X)
all.equal(as.numeric(svm.model$decision.values), my_decision.values)
## [1] TRUE
set.seed(0)
x1 = rnorm(300)
x2 = rnorm(300)
lr1 = x1^2 + x2^2
y = (lr1 > mean(lr1)) * 2 - 1
dat1 = data.frame(y = factor(y), x1 = x1, x2 = x2)
svm.model = svm(y~., data = dat1, kernel = "polynomial", scale = FALSE, degree = 3, type = "C-classification", cost = 1e5)
plot(svm.model, dat1)
pred.y3 = predict(svm.model, dat1)
tab3 = table(pred.y3, y)
print(tab3)
## y
## pred.y3 -1 1
## -1 138 46
## 1 49 67
– But with our current understanding of the "kernel function", we find that logistic regression can basically produce almost the same result, as long as you expand the dimensions by hand, so SVM does not seem all that impressive.
– To push SVM to a level logistic regression cannot reach, we have a crazy idea: expand the dimensions to "infinitely many", which logistic regression could never do, right?
– There is such a thing: the "radial basis kernel function", which some call the "gaussian kernel function".
– At first glance this is infinitely many dimensions, but if the gamma parameter is set very small, the later terms decay extremely fast, so it is almost equivalent to classifying in a "low-dimensional" space; conversely, if gamma is set very, very large, it really does amount to a linear split in an "infinite-dimensional" space. What consequences would that have? (See the expansion written out after this list.)
– But real-world data always contains some measurement error. We fully understand that after projecting it into "infinitely many dimensions" the data can definitely be separated perfectly, but would that actually hurt generalization?
– A basic result from statistics: if the number of dimensions exceeds the number of samples (and the points are in general position), there is guaranteed to be a hyperplane that separates the data perfectly.
– We must beware of the danger of "overfitting".
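– For reference, here is the standard expansion (not reproduced from the original slides) showing that the radial basis kernel corresponds to an inner product in an infinite-dimensional feature space:

$$K(\mathbf{u}, \mathbf{v}) = e^{-\gamma\lVert\mathbf{u}-\mathbf{v}\rVert^2} = e^{-\gamma\lVert\mathbf{u}\rVert^2}\, e^{-\gamma\lVert\mathbf{v}\rVert^2} \sum_{k=0}^{\infty} \frac{(2\gamma)^k}{k!}\left(\mathbf{u}^\top\mathbf{v}\right)^k$$

– The factor $(2\gamma)^k / k!$ is why a small gamma makes the high-order terms die out quickly, while a very large gamma keeps them alive.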
data(iris)
set.seed(0)
train.sample = sample(1:150, 90)
train.iris = iris[train.sample,]
test.iris = iris[-train.sample,]
svm.model_1 = svm(Species~., data = train.iris, kernel = "radial", gamma = 1e3, cost = 1e5)
pred.y1 = predict(svm.model_1, test.iris)
tab1 = table(pred.y1, test.iris[,5])
tab1
##
## pred.y1 setosa versicolor virginica
## setosa 0 0 0
## versicolor 21 19 19
## virginica 0 0 1
– Therefore we cannot allow the SVM to chase arbitrarily complex separating hyperplanes; we have to let the SVM tolerate some error.
– After all, through the "radial basis kernel function" we can effectively project the data into "infinitely many dimensions" for classification, so the optimization must be able to tolerate errors, otherwise the answer is bound to overfit.
– So the complete SVM optimization objective is in fact not the expression we used earlier, but rather… (written out after this list).
– Pay attention to the parameter "cost": it specifies how large a penalty is paid for each misclassification. The larger the number, the larger the penalty, and the harder the SVM tries to find a hyperplane that separates the data perfectly. But real-world problems almost never admit a perfect solution and errors are inevitable, so we must choose an appropriate "cost" to keep it from finding an overly complex separating hyperplane.
– Once you see this expression it becomes clear that when the parameter C (i.e. cost in the package) is set very, very large, the optimization is forced to push every ξ very close to 0, which is equivalent to our original formulation. That is why, in the earlier examples where we compared against our own svm, we always set "cost" extremely large.
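– The soft-margin objective these bullets refer to, written out as a standard reconstruction (the original equation image is not reproduced here):

$$\min_{\mathbf{w},\, b,\, \boldsymbol{\xi}}\ \frac{1}{2}\lVert\mathbf{w}\rVert^2 + C\sum_{i=1}^{n}\xi_i \quad \text{s.t.}\quad y_i(\mathbf{w}^\top\mathbf{x}_i + b) \ge 1 - \xi_i,\ \ \xi_i \ge 0$$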
svm.model_2 = svm(Species~., data = train.iris, kernel = "radial", gamma = 1e3, cost = 1)
pred.y2 = predict(svm.model_2, test.iris)
tab2 = table(pred.y2, test.iris[,5])
tab2
##
## pred.y2 setosa versicolor virginica
## setosa 0 0 0
## versicolor 21 19 19
## virginica 0 0 1
svm.model_3 = svm(Species~., data = train.iris, kernel = "radial", gamma = 1, cost = 1)
pred.y3 = predict(svm.model_3, test.iris)
tab3 = table(pred.y3, test.iris[,5])
tab3
##
## pred.y3 setosa versicolor virginica
## setosa 20 0 0
## versicolor 0 18 1
## virginica 1 1 19
– The function "tune" searches for the best parameters via cross-validation.
data(iris)
set.seed(0)
train.sample = sample(1:150, 90)
train.iris = iris[train.sample,]
test.iris = iris[-train.sample,]
svm_tune <- tune(svm, train.x = train.iris[,-5], train.y = train.iris[,5],
kernel = "radial",
ranges = list(cost = 2^(-3:3), gamma = 2^(-3:3)))
print(svm_tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 8 0.125
##
## - best performance: 0.02222222
best.svm.model = svm(Species~., data = train.iris, kernel = "radial",
cost = svm_tune$best.parameters[1],
gamma = svm_tune$best.parameters[2])
best.pred.y = predict(best.svm.model, test.iris)
best.tab = table(best.pred.y, test.iris[,5])
best.tab
##
## best.pred.y setosa versicolor virginica
## setosa 21 0 0
## versicolor 0 17 0
## virginica 0 2 20
– Please download the handwritten-digit data here.
– The computation may take a very long time to finish, so it is recommended that you not use too many samples at first!
SVM-Kernel: polynomial
cost: 1
degree: 5
gamma: 0.00127551
coef0: 0
– But it still classifies on the basis of dimension expansion; even with the parameter "cost" added to avoid "overfitting", striking the right balance takes a lot of experimentation.
– A simpler approach is to combine it with a Random Forest: use the Random Forest's tendency to rely on only a few variables to screen features, and then let the SVM expand the dimensions and do the final classification (a minimal sketch follows after this list).
– There is also a method dedicated to image feature extraction: the Histogram of Oriented Gradients, published in 2005 by Navneet Dalal and Bill Triggs as "Histograms of Oriented Gradients for Human Detection", which has been cited more than 20,000 times to date.
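– A minimal sketch of the Random Forest + SVM idea mentioned above. This code is not from the original lecture; keeping 2 features and reusing the radial kernel with the cost/gamma values found by tune earlier are illustrative assumptions.

library(randomForest)
library(e1071)
data(iris)
set.seed(0)
train.sample = sample(1:150, 90)
train.iris = iris[train.sample,]
test.iris = iris[-train.sample,]
# Step 1: rank the features with a Random Forest
rf.model = randomForest(Species ~ ., data = train.iris, importance = TRUE)
imp = importance(rf.model, type = 2)                          # MeanDecreaseGini
top.vars = rownames(imp)[order(imp, decreasing = TRUE)][1:2]  # keep the 2 most important features
# Step 2: fit the SVM on the selected features only
svm.formula = as.formula(paste("Species ~", paste(top.vars, collapse = " + ")))
svm.model = svm(svm.formula, data = train.iris, kernel = "radial", cost = 8, gamma = 0.125)
table(predict(svm.model, test.iris), test.iris[,5])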
To this day, support vector machines remain the primary prediction tool for structured data!
But in the domain of unstructured data, such as image recognition, speech recognition, and text mining, artificial neural networks have gradually displaced support vector machines since 2012. Starting from the next lesson, we will begin covering artificial neural networks, working step by step through their principles, special architectures, and modern developments, in order to understand why artificial neural networks have beaten support vector machines.